In [2]:
!pip install wordcloud
Collecting wordcloud
  Downloading https://files.pythonhosted.org/packages/ae/af/849edf14d573eba9c8082db898ff0d090428d9485371cc4fe21a66717ad2/wordcloud-1.5.0-cp36-cp36m-manylinux1_x86_64.whl (361kB)
    100% |████████████████████████████████| 368kB 35.2MB/s 
Requirement already satisfied: numpy>=1.6.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from wordcloud) (1.14.5)
Requirement already satisfied: pillow in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from wordcloud) (5.2.0)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.5.0
You are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [3]:
import pandas as pd
import numpy as np
import scipy.stats as scs
import statsmodels.api as sm
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline
%config InlineBackend.figure_format='retina'
In [4]:
# Load the dataset and discard the 'Unnamed: 0' column in a single chained
# expression (presumably an index column written out when the CSV was saved --
# TODO confirm), then preview the first rows.
df = pd.read_csv('small_descr_clm_code.csv').drop(columns='Unnamed: 0')
df.head()
Out[4]:
descr clm code
0 CROSS-REFERENCE TO RELATED APPLICATIONS \n ... 1. A computer-implemented method of designing ... 706
1 RELATED APPLICATIONS \n This application i... What is claimed is: \n \n 1 . A sy... 705
2 CROSS REFERENCE TO RELATED APPLICATION \n ... 1. A weather information display device compri... 706
3 TECHNICAL FIELD \n The present disclosure ... 1 . A method of obtaining a user's measure... 705
4 CROSS-REFERENCE TO RELATED APPLICATIONS \n ... 1 . A method for providing borrower foreclosur... 705
In [5]:
# Merge description and claims into one text column, drop the two source
# columns, and store the class code as a categorical.
# - A single space is inserted between the two parts so the last word of the
#   description cannot fuse with the first token of the claims.
# - Missing values are replaced with '' first; otherwise a NaN in either column
#   would turn the whole combined text into NaN.
df = (
    df.assign(descr_clm=df['descr'].fillna('') + ' ' + df['clm'].fillna(''))
      .drop(columns=['descr', 'clm'])
      .astype({'code': 'category'})
)
In [6]:
# Preview the first five rows of the reshaped frame (code + combined text).
df.head(n=5)
Out[6]:
code descr_clm
0 706 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
1 705 RELATED APPLICATIONS \n This application i...
2 706 CROSS REFERENCE TO RELATED APPLICATION \n ...
3 705 TECHNICAL FIELD \n The present disclosure ...
4 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...

Word Cloud for 705

In [7]:
# Keep only the rows whose code is 705 (all columns retained).
df_705 = df.loc[df['code'] == 705, :]
In [8]:
# Preview the first five rows of the 705 subset.
df_705.head(n=5)
Out[8]:
code descr_clm
1 705 RELATED APPLICATIONS \n This application i...
3 705 TECHNICAL FIELD \n The present disclosure ...
4 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
5 705 CROSS REFERENCE TO OTHER APPLICATIONS \n T...
6 705 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
In [18]:
# Domain-specific boilerplate words to exclude from the word clouds, in addition
# to the library's default stop words. Each entry is listed once (the original
# list repeated 'include' three times); the set of words is unchanged.
# NOTE(review): the two multi-word entries are kept as-is; stop words appear to
# be compared against single tokens, so they may never match -- confirm.
custom_stopword_list = [
    'wherein',
    'subject matter',
    'subject',
    'matter',
    'first',
    'second',
    'include',
    'includes',
    'comprise',
    'said',
    'disclosed',
    'presently disclosed',
    'system',
    'process',
    'method',
    'one',
    'may',
    'claim',
    'embodiment',
    'invention',
    'example',
    'step',
    'figure',
    'fig',
]

Add custom list of words to stop words

`stopwords` is a set. `set.add()` inserts a single element, so it cannot be used to add a whole list of words. Instead, I convert the list to a set and merge it in with `set |= set(list)`, the in-place union operator.

In [19]:
# Start from a *copy* of the library's default stop words. Binding the name
# directly to STOPWORDS would alias the module-level set, and the in-place
# union below would then permanently modify wordcloud.STOPWORDS for everything
# else running in this kernel.
stopwords = set(STOPWORDS)
stopwords |= set(custom_stopword_list)
In [20]:
# Build one corpus string from every 705 document. Calling str() on the NumPy
# array would instead produce the array's printed representation: quoted
# elements, escaped "\n" sequences and, once the array is longer than NumPy's
# print threshold, only the first and last few documents separated by "...".
text = ' '.join(df_705['descr_clm'].dropna().astype(str))

wordcloud_705 = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=stopwords,
).generate(text)

# Render the cloud on an explicit figure/axes pair with a black, frameless canvas.
fig, ax = plt.subplots(figsize=(40, 30), facecolor='k', edgecolor='k')
ax.imshow(wordcloud_705, interpolation='bilinear')
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()

Word Cloud for 706

In [21]:
# Keep only the rows whose code is 706 (all columns retained).
df_706 = df.loc[df['code'] == 706, :]
In [22]:
# Preview the first five rows of the 706 subset.
df_706.head(n=5)
Out[22]:
code descr_clm
0 706 CROSS-REFERENCE TO RELATED APPLICATIONS \n ...
2 706 CROSS REFERENCE TO RELATED APPLICATION \n ...
11 706 BACKGROUND \n Organizations are typically ...
22 706 CROSS-REFERENCE TO RELATED APPLICATION \n ...
23 706 RELATED APPLICATIONS \n The present applic...
In [23]:
# Build one corpus string from every 706 document. Calling str() on the NumPy
# array would instead produce the array's printed representation: quoted
# elements, escaped "\n" sequences and, once the array is longer than NumPy's
# print threshold, only the first and last few documents separated by "...".
text = ' '.join(df_706['descr_clm'].dropna().astype(str))

wordcloud_706 = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=stopwords,
).generate(text)

# Render the cloud on an explicit figure/axes pair with a black, frameless canvas.
fig, ax = plt.subplots(figsize=(40, 30), facecolor='k', edgecolor='k')
ax.imshow(wordcloud_706, interpolation='bilinear')
ax.axis('off')
fig.tight_layout(pad=0)
plt.show()
In [ ]: